Boru Chen
%matplotlib inline
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
pd.set_option('display.max_columns', None)
raw = pd.read_csv("./StrayAnimalsAnalysis/clean1.csv")
# # Drop data without image
# filename = list(map(lambda x: str(x) + "_front_org.jpg", raw["KeyNo"]))
# img_root = "./images"
# without_img = []
# for path in filename:
#     if not os.path.exists(os.path.join(img_root, path)):
#         without_img.append(path.split('_')[0])
raw.set_index("KeyNo", inplace=True)
# raw.drop(list(map(lambda x: int(x),without_img)), inplace=True)
y = raw["Adopted"]
df = raw.drop(columns=["Adopted", "StayDays", "_Situation2Name", "StatusName"])
df_ohe = pd.get_dummies(df)
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.1, random_state=42)
X_train_ohe, X_test_ohe, y_train, y_test = train_test_split(df_ohe, y, test_size=0.1, random_state=42)
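Both splits use the same random_state and row order, so the raw and one-hot frames stay row-aligned and the second y_train/y_test assignment is harmless. A quick sanity check (a sketch):
assert X_train.index.equals(X_train_ohe.index)
assert X_test.index.equals(X_test_ohe.index)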
raw.head()
X_train.head()
print("Training Dataset")
print("nrow:", len(X_train))
print("label")
print("1:", sum(y_train))
print("0:", sum(1-y_train))
print()
print("Test Dataset")
print("nrow:", len(X_test))
print("label")
print("1:", sum(y_test))
print("0:", sum(1-y_test))
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
scaler = StandardScaler().fit(X_train_ohe)
X_train_std = scaler.transform(X_train_ohe)
X_test_std = scaler.transform(X_test_ohe)
pca = PCA(n_components = 0.8).fit(X_train_std)
X_train_pca = pca.transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
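With n_components=0.8, PCA keeps the smallest number of components whose cumulative explained variance reaches 80%. The fitted object exposes both quantities (a quick check, sketch):
print("components kept:", pca.n_components_)
print("variance explained:", round(pca.explained_variance_ratio_.sum(), 4))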
from sklearn.linear_model import LogisticRegressionCV
LR_L2 = LogisticRegressionCV(cv=5, random_state=0, solver='saga', max_iter = 10000, n_jobs = -1).fit(X_train_pca, y_train)
LR_L2.predict(X_test_pca)
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23; import joblib directly
joblib.dump(LR_L2, './saved_model/LR_L2.joblib')
LR_L2 = joblib.load('saved_model/LR_L2.joblib')
LR_L2_predict_prob = LR_L2.predict_proba(X_test_pca)[:,1]
LR_L2_predict = LR_L2.predict(X_test_pca)
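LogisticRegressionCV stores the regularization strength selected by 5-fold CV, and score() gives a quick test accuracy (a sketch):
print("selected C:", LR_L2.C_)
print("test accuracy:", round(LR_L2.score(X_test_pca, y_test), 4))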
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, f1_score
def plot_roc_curve(fpr, tpr):
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()
fpr, tpr, thresholds = roc_curve(y_test, LR_L2_predict_prob)
plot_roc_curve(fpr, tpr)
roc_auc_score(y_test, LR_L2_predict_prob)
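Beyond AUC, the roc_curve output supports choosing an operating threshold, e.g. by maximizing Youden's J = TPR - FPR (a sketch using the fpr/tpr/thresholds above):
j = tpr - fpr
print("Youden-optimal threshold:", round(float(thresholds[np.argmax(j)]), 4))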
# accuracy_score(y_test, LR_L2_predict)
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier().fit(X_train_ohe, y_train)
RFC_predict_prob = RFC.predict_proba(X_test_ohe)[:, 1]
RFC_predict = RFC.predict(X_test_ohe)
fpr, tpr, thresholds = roc_curve(y_test, RFC_predict_prob)
plot_roc_curve(fpr, tpr)
roc_auc_score(y_test, RFC_predict_prob)
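Since the forest was trained on the one-hot frame, its feature importances map directly onto column names (a sketch):
importances = pd.Series(RFC.feature_importances_, index=X_train_ohe.columns)
print(importances.sort_values(ascending=False).head(10))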
r = np.power(len(X_train_std) / 4, 1 / 10)
k_range = [1]
for i in range(1, 10):
    k_range.append(k_range[i-1] * r)
k_range = np.array(list(map(round, k_range))).astype('int')
k_range
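The loop builds ten k values growing geometrically from 1 toward len(X_train_std)/4 (common ratio r, so the last value is (n/4)^(9/10)). np.geomspace expresses the same grid in one call (an equivalent sketch):
k_alt = np.round(np.geomspace(1, (len(X_train_std) / 4) ** (9 / 10), num=10)).astype(int)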
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train_std, y_train, cv=5, scoring='accuracy')
    k_scores.append(scores.mean())
# save the result
joblib.dump(k_scores, './saved_model/k_scores.joblib')
k_scores = joblib.load('saved_model/k_scores.joblib')
best_k = k_range[np.argmax(k_scores)]
print("max: accuracy", max(k_scores))
print("Set hyperparameter k = ", best_k)
print()
# plot to see clearly
plt.plot(k_range, k_scores)
plt.xlabel('Value of k for kNN')
plt.ylabel('Cross-Validated Accuracy')
plt.title('Accuracy with respect to different k')
plt.savefig("kNN_k.png")
plt.show()
from sklearn.neighbors import KNeighborsClassifier
kNN = KNeighborsClassifier(n_neighbors = best_k)
kNN.fit(X_train_std, y_train)
kNN_predict_prob = kNN.predict_proba(X_test_std)[:, 1]
kNN_predict = kNN.predict(X_test_std)
fpr, tpr, thresholds = roc_curve(y_test, kNN_predict_prob)
plot_roc_curve(fpr, tpr)
roc_auc_score(y_test, kNN_predict_prob)
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier().fit(X_train_std, y_train)
MLP_predict_prob = MLP.predict_proba(X_test_std)[:, 1]
MLP_predict = MLP.predict(X_test_std)
fpr, tpr, thresholds = roc_curve(y_test, MLP_predict_prob)
plot_roc_curve(fpr, tpr)
roc_auc_score(y_test, MLP_predict_prob)
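MLPClassifier() runs with all defaults (one hidden layer of 100 units, adam, max_iter=200) and may stop with a ConvergenceWarning before converging. A variant with a fixed seed and a higher iteration cap (a sketch, not the run scored above):
MLP_alt = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=0).fit(X_train_std, y_train)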
from sklearn.ensemble import AdaBoostClassifier
AdaBoost = AdaBoostClassifier().fit(X_train_ohe, y_train)
AdaBoost_predict_prob = AdaBoost.predict_proba(X_test_ohe)[:, 1]
AdaBoost_predict = AdaBoost.predict(X_test_ohe)
fpr, tpr, thresholds = roc_curve(y_test, AdaBoost_predict_prob)
plot_roc_curve(fpr, tpr)
roc_auc_score(y_test, AdaBoost_predict_prob)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
LDA = LinearDiscriminantAnalysis().fit(X_train_std, y_train)
LDA_predict_prob = LDA.predict_proba(X_test_std)[:, 1]
LDA_predict = LDA.predict(X_test_std)
fpr, tpr, thresholds = roc_curve(y_test, LDA_predict_prob)
plot_roc_curve(fpr, tpr)
roc_auc_score(y_test, LDA_predict_prob)
predict = [LR_L2_predict, kNN_predict, LDA_predict, AdaBoost_predict, RFC_predict, MLP_predict]
predict_prob = [LR_L2_predict_prob, kNN_predict_prob, LDA_predict_prob, AdaBoost_predict_prob, RFC_predict_prob, MLP_predict_prob]
f1 = []
for i in predict:
    f1.append(f1_score(y_test, i))
auc = []
for i in predict_prob:
    auc.append(roc_auc_score(y_test, i))
f1_round = list(map(lambda x: round(x,4), f1))
auc_round = list(map(lambda x: round(x,4), auc))
print(f1_round)
print(auc_round)
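For readability, the two score lists combine into one table, in the same model order as predict and predict_prob (a sketch):
summary = pd.DataFrame({"F1": f1_round, "AUC": auc_round},
                       index=["Logistic Regression", "kNN", "LDA", "AdaBoost", "Random Forest", "MLP"])
print(summary)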
fig = plt.figure(figsize = (6,6), facecolor = 'white', dpi = 300)
ax = fig.add_subplot(111)
ax.set_xlabel('False Positive Rate', fontsize = 12)
ax.set_ylabel('True Positive Rate', fontsize = 12)
ax.set_title('ROC', fontsize = 14)
ax.plot([0, 1], [0, 1], color='darkgray', linestyle='--')
colors = plt.cm.rainbow(np.linspace(0,1,6))
labels = ["Logistic Regression", "kNN", "LDA", "AdaBoost", "Random Forest", "MLP"]
for i in range(6):
    fpr, tpr, thresholds = roc_curve(y_test, predict_prob[i])
    ax.plot(fpr, tpr, color = colors[i], label = labels[i], alpha = 0.6)
ax.legend()
fig.savefig("ROC.png", bbox_inches = 'tight')
We apply t-SNE here to project the one-hot-encoded features into two dimensions for visualization.
from sklearn.manifold import TSNE
#Cosine metric
tsne_cos = TSNE(n_components=2, metric = 'cosine', random_state = 42)
df_tsne_cos = pd.DataFrame(tsne_cos.fit_transform(df_ohe), columns = ["X1", "X2"])
# joblib.dump(df_tsne_cos, './saved_model/df_tsne_cos.joblib')
df_tsne_cos = joblib.load('saved_model/df_tsne_cos.joblib')
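Fitting t-SNE on the full one-hot frame is slow, hence the cached result above. A guard that recomputes only when the cache is missing (a sketch, assuming the same path):
if os.path.exists('./saved_model/df_tsne_cos.joblib'):
    df_tsne_cos = joblib.load('./saved_model/df_tsne_cos.joblib')
else:
    df_tsne_cos = pd.DataFrame(tsne_cos.fit_transform(df_ohe), columns=["X1", "X2"])
    joblib.dump(df_tsne_cos, './saved_model/df_tsne_cos.joblib')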
# df_tsne_cos.reset_index(inplace = True)
df_tsne_cos["y"] = list(y)
fig = plt.figure(figsize = (21,30), facecolor = 'white', dpi = 300)
ax1 = fig.add_subplot(311)
ax1.set_xlabel('X1', fontsize = 15)
ax1.set_ylabel('X2', fontsize = 15)
ax1.set_title('t-SNE with Cosine Metric', fontsize = 20)
ax1.scatter(df_tsne_cos["X1"][df_tsne_cos["y"] == 1],
            df_tsne_cos["X2"][df_tsne_cos["y"] == 1],
            color = "orange", alpha = 0.6, label = "1")
ax1.scatter(df_tsne_cos["X1"][df_tsne_cos["y"] == 0],
            df_tsne_cos["X2"][df_tsne_cos["y"] == 0],
            color = "green", alpha = 0.6, label = "0")
ax1.set_xlim(-75, 75)
ax1.legend(fontsize = 14)
ax1.grid()